In this project, I will attempt to predict whether any building within a set of identified buildings in Detroit will be targeted for demolition. Potential predictors include citations related to the building, crime, and complaints concerning the building related to blight. I have more-or-less completed the data cleaning, and I am in the process of associating the various predictors with the buildings. So far I am using about 3GB of data downloaded from https://data.detroitmi.gov/.

library(tidyverse)
library(sf)
library(ggmap)

#recorded violations associated with blight (e.g. unkempt properties)
blight_violations <- read_csv("./data/Blight_Violations_3_19_2018.csv", 
                              guess_max = 10^6)

#read the downloaded file for all the building permits and then filter to keep just the permits for dismantling
dismantle_permits <- read_csv("./data/Building_Permits_3_19_2018.csv", 
                             guess_max = 10^6) %>%
  filter(`Building Permit Type` == "Dismantle")
  
#the files that contain the crime data
crime_to_12062016 <- 
  read_csv("./data/DPD__All_Crime_Incidents__January_1__2009_-_December_6__2016.csv",
           guess_max = 10^6)

crime_12062016_to_03192018 <- 
  read_csv("./data/DPD__All_Crime_Incidents__December_6__2016_-_3_19_2018.csv", 
           guess_max = 10^6)

#the 311 system
improve_detroit_issues <- read_csv("./data/Improve_Detroit_Issues_3_19_2018.csv", 
                                   guess_max = 10^6)

#another file with demolition information downloaded 4/4/2017. I should note that this file appears to cover a somewhat different domain of cases than does dismantle_permits. The difference appears to be that completed_demolitions covers just those buildings that have been dismantled under the Detroit Demolition Program. It may thus be the case that this data omits cases of buildings demolished not because of blight but to make room for something else. 
completed_demolitions <- read_csv("./data/Detroit_Demolitions.csv",
                                  guess_max = 10^6)

upcoming_demolitions <- read_csv("./data/Upcoming_Demolitions.csv")

#the shapefile representing Detroit parcels, read as an sf data frame
parcel_sf <- st_read("./data/Parcel Map")

#convert the parcel number column to a character vector, to match `Parcel Number` in the dismantle permits data
parcel_sf <- parcel_sf %>% mutate(parcelnum = as.character(parcelnum))

#downloaded from http://d3-d3.opendata.arcgis.com/datasets/383eb730952e470389f09617b5448026_0 on 04/13/2018: "Hardest Hit Fund Areas, with Expansion"
Hardest_Hit_Fund_Areas <- st_read("./data/Hardest_Hit_Fund_Areas_with_Expansion.kml", 
                                  crs = st_crs(parcel_sf))

city_council_districts <- st_read("./data/City Council Districts")

For all of the downloaded data sets other than the parcels dataset, we extract the usable latitude and longitude values and then use this information to form simple features (sf) objects. Rows with obviously incorrect values, or values that would represent positions well outside Detroit, are filtered out, together with rows for which the latitude or longitude data is missing.
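
To make the extraction step concrete, here is the same str_sub/stri_locate logic applied to a single made-up position string (the real columns follow the same pattern, ending in "(latitude, longitude)"; the address here is invented):

#a toy example of the assumed position format
position <- "4331 Barham (42.394, -83.048)"
str_sub(position,
        stringi::stri_locate_last_fixed(position, "(")[,2] + 1,
        stringi::stri_locate_last_fixed(position, ",")[,1] - 1)
#the result should be "42.394", the latitude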

#function for converting the position (character) column into a column of points in the simple features (sf) framework.
add_sf_point <- function(df, column) {
  
  #extract the latitude and longitude from the string column that contains both. With the parentheses
  #located from the end of the strings, it is possible to use the same function for all five of 
  #the datasets for which we need to extract this information.
  latitude <- str_sub(df[[column]], 
                      stringi::stri_locate_last_fixed(df[[column]], "(")[,2] + 1,
                      stringi::stri_locate_last_fixed(df[[column]], ",")[,1] - 1)
  longitude <- str_sub(df[[column]], 
                       stringi::stri_locate_last_fixed(df[[column]], ", ")[,2] + 1, 
                       stringi::stri_locate_last_fixed(df[[column]], ")")[,1] - 1)
  
  #add the latitude and longitude to a copy of the dataframe, filter out the NAs from
  #these results, and then convert to sf, with point positions indicated in the
  #geometry column
  mutated <- df %>% mutate(extracted_lat = as.double(latitude),
                      extracted_lon = as.double(longitude))
  
  #remove rows with NAs for latitude or longitude, or with values well outside of Detroit
  filtered <- mutated %>%
    filter(!is.na(extracted_lat) & !is.na(extracted_lon)) %>%
    filter(41 < extracted_lat & extracted_lat < 44 & -85 < extracted_lon & extracted_lon < -81)
    
  #create a dataframe from the items that have been filtered out
  result_coord_na <- setdiff(mutated, filtered)
  
  #create sf objects from the rows with usable latitude and longitude information
  result_sf <- st_as_sf(filtered, coords = c("extracted_lon", "extracted_lat"), 
                        crs = st_crs(parcel_sf))
    
  return(list(result_sf, result_coord_na))
}

#apply the function to the seven datasets that were not loaded as simple features dataframes. For each dataset this produces a list of two dataframes: the first element is a simple features data frame, and the second is a dataframe with the instances that could not be converted to simple features
blight_violations_split <- add_sf_point(blight_violations, "Violation Location")

dismantle_permits_split <- add_sf_point(dismantle_permits, "Permit Location")

crime_to_12062016_split <- add_sf_point(crime_to_12062016, "LOCATION")

crime_12062016_to_03192018_split <- add_sf_point(crime_12062016_to_03192018, "Location")

improve_detroit_issues_split <- add_sf_point(improve_detroit_issues, "Location")

completed_demolitions_split <- add_sf_point(completed_demolitions, "Location")

upcoming_demolitions_split <- add_sf_point(upcoming_demolitions, "Location")

We now consider the data for which we do not yet have position data, and complete the information as well as we reasonably can, using the Google API and a function, geocode_pause, that handles some of the API’s quirks.

#the portion of the downloaded blight citations data for which we do not have position data
blight_vio_na <- blight_violations_split[[2]]

#remove the rows for which geocoding is not likely to produce reliable results
useful <- blight_vio_na %>% filter(!is.na(`Violation Street Name`), 
                                   `Violation Street Number` > 0, 
                                   !is.na(`Violation Zip Code`))

#create a column of addresses to be used in geocoding
useful <- useful %>% 
  mutate(complete_address = paste(`Violation Street Number`, " ", `Violation Street Name`, ", ",
                                  "Detroit, Michigan", " ", `Violation Zip Code`, sep = ""))

#function that makes a maximum of 6 attempts to geocode the given address using the Google API, with a pause of 1 second between attempts. We will use the function for the other datasets as well.
geocode_pause <- function(address) {
  for (index in 1:6) {
    Sys.sleep(1)
    location <- ggmap::geocode(address)
    if (!is.na(location$lon)) {
      return(location)
    }
  }
  #if all six attempts fail, control falls out of the loop and the function
  #returns NULL, which we detect later with remove_null_locations()
}
#apply geocode_pause to each of the elements of the complete_address column and place the result in a new column, in which each entry is a data frame 
useful <- useful %>% mutate(location = map(complete_address, geocode_pause))

#save to disk, to avoid the need to geocode these addresses again when we rerun the analysis
write_rds(useful, "./data/blight_violations_geocodes.rds")

The geocoding has returned a data frame for each of the addresses. We thus need to unpack the elements of the location column, each of which is a data frame.
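
To make the unnesting step concrete, here is a minimal sketch on made-up data; the location column in our data has the same shape, one small data frame per row:

#toy example: a list column of one-row data frames, flattened by unnest()
toy <- tibble(id = 1:2,
              location = list(tibble(lon = -83.1, lat = 42.4),
                              tibble(lon = -83.0, lat = 42.3)))
toy %>% unnest(location)
#the result has the columns id, lon, and lat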

#read blight_violations_geocodes as a tibble
blight_violations_geocodes <- read_rds("./data/blight_violations_geocodes.rds")

#function for removing the instances for which geocoding failed (for which the value in the location column is NULL). We will use this function for all of the geocoded data frames.
remove_null_locations <- function(df) {
  #identify the rows for which the value in the location column is NULL
  null_rows <- purrr::map_lgl(df$location, is.null)
  #logical subsetting handles the edge case in which no location is NULL
  #(a negative index built from an empty vector would instead drop every row)
  df[!null_rows, ]
}

blight_violations_geocodes <- remove_null_locations(blight_violations_geocodes)

#With blight_violations_geocodes a tibble, we can apply tidyr::unnest(), which will place the latitude and longitude in columns labelled "lat" and "lon".
blight_violations_geocodes <- blight_violations_geocodes %>% unnest(location)

#fill in the `Violation Latitude` and `Violation Longitude` columns, which already exist in the blight_violations data frame
blight_violations_geocodes <- blight_violations_geocodes %>%
  mutate(`Violation Latitude` = lat,
         `Violation Longitude` = lon)

#cut out some columns that have been added
blight_violations_geocodes <- blight_violations_geocodes %>% 
  select(-extracted_lat, -extracted_lon, -complete_address)

#put the position information into a simple features format (which will remove the "lat" and "lon" columns)
blight_violations_geocodes_sf <- st_as_sf(blight_violations_geocodes, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))

#combine the results with the previously generated sf data
blight_violations_sf <- rbind(blight_violations_split[[1]], blight_violations_geocodes_sf)

rm(blight_vio_na, blight_violations, blight_violations_geocodes, 
   blight_violations_geocodes_sf, blight_violations_split, useful)
#the dismantle permits for which position data (latitude and longitude) is missing
dismantle_permits_split_na <- dismantle_permits_split[[2]]

#remove the last two columns, which were not contained in the original dismantle_permits dataset
dismantle_permits_split_na <- dismantle_permits_split_na %>% 
  select(-extracted_lat, -extracted_lon)
#geocode the items in dismantle_permits_split_na, using the address column and the function geocode_pause, which makes a maximum of six attempts for each item. The result is a list of dataframes in the location column.
dismantle_permits_split_geocode <- dismantle_permits_split_na %>%
  mutate(location = map(str_c(`Site Address`, ", Detroit, Michigan"), geocode_pause))

#write the results of the geocoding to disk, to avoid having to repeat the geocoding when rerunning the analysis.
write_rds(dismantle_permits_split_geocode, "./data/dismantle_permits_geocodes.rds")

rm(dismantle_permits_split_na)
#load the geocoded data frame into R
dismantle_permits_split_geocode <- read_rds("./data/dismantle_permits_geocodes.rds")

#use remove_null_locations() to remove the rows for which geocoding failed and then parse the information in the dataframes in the location column into two new columns, lat and lon
dismantle_permits_split_geocode <- 
  remove_null_locations(dismantle_permits_split_geocode) %>%
  unnest(location)

#convert to a simple features (sf) data frame, using the latitudes and longitudes
dismantle_permits_geocode_sf <- st_as_sf(dismantle_permits_split_geocode, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))

#append this simple features dataframe to the dataframe for which we already had usable positions
dismantle_permits_sf <- rbind(dismantle_permits_split[[1]], dismantle_permits_geocode_sf)

rm(dismantle_permits_split_geocode, dismantle_permits_geocode_sf, dismantle_permits, dismantle_permits_split)

We now fill in the missing position information for the dataset of crimes up to 12-06-2016.

#return to the older crime data
crime_to_12062016_leftovers <- crime_to_12062016_split[[2]]

#cut out the addresses that begin with "00"
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  filter(str_sub(LOCATION, 1, 2) != "00")

#filter out some obviously useless addresses, with few characters before the first "("
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  filter(!str_locate(LOCATION, "\\(")[,1] %in% 1:13)

#remove the two columns that were added earlier
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  select(-extracted_lat, -extracted_lon)

#create a column for use in geocoding
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  mutate(extracted_address = str_c(str_sub(LOCATION, 1, 
                                           str_locate(LOCATION, "\\(")[,1] - 2),
                                                      ", Detroit, Michigan"))
#geocode the elements of extracted_address, using the function geocode_pause
crime_to_12062016_leftovers_geocode <- crime_to_12062016_leftovers %>% 
  mutate(location = map(extracted_address, geocode_pause))

#save the results, to avoid having to geocode again when rerunning the analysis
write_rds(crime_to_12062016_leftovers_geocode, "./data/crime_to_12062016_leftovers_geocode.rds")
#read the saved results
crime_to_12062016_leftovers_geocode <- read_rds("./data/crime_to_12062016_leftovers_geocode.rds")

#cut out the column we used for geocoding
crime_to_12062016_leftovers_geocode <- 
  crime_to_12062016_leftovers_geocode %>% select(-extracted_address)
  
#cut out the geocode failures and put the location information into the columns lat and lon
crime_to_12062016_leftovers_geocode <- 
  remove_null_locations(crime_to_12062016_leftovers_geocode) %>%
  unnest(location)

#create a simple features (sf) object, using the latitudes and longitudes
crime_to_12062016_leftovers_sf <- st_as_sf(crime_to_12062016_leftovers_geocode, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))

#append this simple features dataframe to the dataframe for which we already had locations
crime_to_12062016_sf <- rbind(crime_to_12062016_split[[1]], crime_to_12062016_leftovers_sf)

rm(crime_to_12062016, crime_to_12062016_leftovers_geocode, crime_to_12062016_leftovers_sf, crime_to_12062016_leftovers, crime_to_12062016_split)
#consider the examples in the recent crime data for which the conversion to sf didn't work, remove the two columns that we have added, and create an address column for geocoding
crime_12062016_to_03192018_leftovers <- crime_12062016_to_03192018_split[[2]] %>%
  select(-extracted_lat, -extracted_lon) %>%
  mutate(extracted_address = str_c(`Incident Address`, ", Detroit, Michigan"))
crime_12062016_to_03192018_geocode <- crime_12062016_to_03192018_leftovers %>% 
  mutate(location = map(extracted_address, geocode_pause))

write_rds(crime_12062016_to_03192018_geocode, "./data/crime_12062016_to_03192018_geocode.rds")
crime_12062016_to_03192018_geocode <- read_rds("./data/crime_12062016_to_03192018_geocode.rds") %>%
  select(-extracted_address)

#remove the rows for which the value of location is NULL and then unnest the remaining locations
crime_12062016_to_03192018_geocode <- 
  remove_null_locations(crime_12062016_to_03192018_geocode) %>%
  unnest(location)

#convert the dataframe to a simple features set
crime_12062016_to_03192018_sf <- st_as_sf(crime_12062016_to_03192018_geocode, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))  

#combine the geocoded data with the sf dataframe created earlier
crime_12062016_to_03192018 <- rbind(crime_12062016_to_03192018_split[[1]], crime_12062016_to_03192018_sf)

rm(crime_12062016_to_03192018_split, crime_12062016_to_03192018_geocode, crime_12062016_to_03192018_leftovers, crime_12062016_to_03192018_sf)
#geocode the one item in the Improve Detroit Issues data for which the given coordinates were obviously incorrect, and then convert to an sf object. If geocoding fails, run this bit again
improve_detroit_issues_leftover_sf <- improve_detroit_issues_split[[2]] %>%
  select(-extracted_lat, -extracted_lon) %>%
  mutate(location = map(Address, geocode_pause)) %>%
  unnest(location) %>% 
  st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf))  
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=14530%20%20Vaughan%20Detroit%2C%20Michigan
#splice with the previously generated sf dataframe
improve_detroit_issues <- rbind(improve_detroit_issues_split[[1]], improve_detroit_issues_leftover_sf)

rm(improve_detroit_issues_split, improve_detroit_issues_leftover_sf)
#create another set of demolition information
completed_demolitions_sf <- completed_demolitions_split[[1]]

#note that the location information in completed_demolitions_sf is complete (the dataframe for the examples with missing location information has zero rows)
completed_demolitions_split[[2]]
#another set of demolition information
upcoming_demolitions_sf <- upcoming_demolitions_split[[1]]

upcoming_demolitions_split[[2]]
rm(completed_demolitions, completed_demolitions_split, upcoming_demolitions, upcoming_demolitions_split)

We begin the process of assigning labels to the buildings: blighted or not blighted. Buildings will be represented by parcels that have or have had buildings on them, whether by being represented in the parcel_sf data frame as including structures or by appearing in the dismantle permits data frame as having had a dismantle permit associated with them (which suggests that there was a building on the parcel).

We will use parcel numbers to refer to the parcels. However, as the following bit of code shows, the parcels dataset contains a few rows in which the parcel numbers are the same (duplicate_parcel_numbers_in_parcel_data contains 78 rows).

#introduce a column of row numbers in the parcels data, for use in data cleaning
parcel_sf <- parcel_sf %>%
  mutate(row_num = row_number())

#As per above, the following returns a 78-row data frame
duplicate_parcel_numbers_in_parcel_data <- 
  parcel_sf %>%
  group_by(parcelnum) %>% 
  mutate(n = n()) %>%
  ungroup() %>% 
  filter(n > 1) %>%
  select(parcelnum, address, legaldesc, row_num)
duplicate_parcel_numbers_in_parcel_data

Plotting the parcel data suggests that, within groups of parcels that have the same parcel number, some of the parcels cover the same area while others are disjoint. We thus apply a spatial join, using sf::st_join, to find pairs of parcels that cover the same area. After that, we modify parcel_sf so that it is a maximal set that contains none of these duplicates.

#join the result with itself, on the basis of spatial identity 
spatial_repeats <- st_join(duplicate_parcel_numbers_in_parcel_data,
                           duplicate_parcel_numbers_in_parcel_data, 
                           st_equals, left = FALSE) %>% filter(row_num.x != row_num.y)

#select one element from each group of sf objects whose polygons cover the same area
selection_vector <- 1:nrow(spatial_repeats)
for (index in 1:nrow(spatial_repeats)) {
  selection_vector[index] <- !(spatial_repeats$row_num.x[index] %in%
                                spatial_repeats$row_num.y[1:index]) 
}
selection_vector <- as.logical(selection_vector)

#the unique parcels, as specified in spatial_repeats
unique_parcels <- spatial_repeats[selection_vector,]

#the unique parcels, as specified in parcel_sf, with a row number added to the end of the parcel number (character vector)
unique_parcels <- parcel_sf %>% filter(row_num %in% unique_parcels$row_num.x) %>%
  mutate(parcelnum = str_c(parcelnum, "_", row_number()))

#cut out the set of sf objects that have spatial repeats  
parcel_sf <- parcel_sf %>% filter(!(row_num %in% spatial_repeats$row_num.x))

#bind the set of unique spatial objects to parcel_sf
parcel_sf <- parcel_sf %>% rbind(unique_parcels)

Having cut out the duplicate parcels (parcels that cover the same area), we plot the groups (as it turns out, pairs) that have the same parcel number.

library(tidyverse)
library(sf)

repeated_parcel_numbers <- parcel_sf %>% group_by(parcelnum) %>% 
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
repeated_parcel_numbers <- repeated_parcel_numbers %>% select(parcelnum) %>% 
  mutate(rownumber = row_number()) %>% mutate(plotted = FALSE)

for (row_1 in repeated_parcel_numbers$rownumber) {
  if (repeated_parcel_numbers$plotted[row_1] == FALSE) {
    for (row_2 in repeated_parcel_numbers$rownumber) {
      if (row_2 != row_1 & 
          repeated_parcel_numbers$parcelnum[row_1] == repeated_parcel_numbers$parcelnum[row_2]) {
        print(ggplot(repeated_parcel_numbers %>% filter(rownumber %in% c(row_1, row_2)) %>%
                     select(parcelnum)) + geom_sf() + ggtitle(repeated_parcel_numbers$parcelnum[row_1]))
        repeated_parcel_numbers[c(row_1, row_2),]$plotted<- TRUE 
      }
    }
  }
}

rm(repeated_parcel_numbers)

Having verified that the repeats of the parcel numbers represent distinct parcels, we modify the parcel numbers to make them distinct.

#add iteration numbers to parcelnum (a character variable) within each set of repeats
parcel_sf <- parcel_sf %>% group_by(parcelnum) %>% mutate(n = n(), repeat_number = row_number()) %>% 
  ungroup() %>%
  mutate(parcelnum = ifelse(n > 1, str_c(parcelnum, "_", repeat_number), parcelnum)) %>%
  select(-n, -repeat_number)

#test for repeats
temp <- parcel_sf %>% as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% mutate(n = n()) %>% filter(n > 1)
temp

We now use sf::st_join, with st_overlaps, to test for parcel overlap; according to the test, there are several thousand pairs of parcels that overlap (share at least some portion of interior area).

#check for overlap of the parcels in parcel_sf
spatial_overlaps <- st_join(parcel_sf, parcel_sf, 
        st_overlaps, left = FALSE) %>% filter(row_num.x != row_num.y)
## although coordinates are longitude/latitude, st_overlaps assumes that they are planar
nrow(spatial_overlaps)
## [1] 9194

Investigating the cases of overlap (according to st_overlaps) with plots of random selections of the supposedly overlapping pairs suggests that the apparent overlap is not significant (see below). It may be due to the fact that st_join treats latitude and longitude values as planar coordinates. I will assume that parcels do not overlap.
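
Before looking at the plots, we can also quantify the apparent overlap. The following is a rough sketch (not part of the main pipeline, and subject to the same planar-coordinates caveat): for the first twenty flagged pairs, compute the area of the intersection as a fraction of the smaller parcel's area. Tiny fractions would support the conclusion that the flagged overlaps are numerical artifacts.

#sketch: intersection area as a fraction of the smaller parcel's area
overlap_fractions <- map_dbl(1:20, function(i) {
  p1 <- parcel_sf %>% filter(parcelnum == spatial_overlaps$parcelnum.x[i])
  p2 <- parcel_sf %>% filter(parcelnum == spatial_overlaps$parcelnum.y[i])
  #st_area returns a units object; strip the units before forming the ratio
  as.numeric(st_area(st_intersection(p1, p2))) /
    min(as.numeric(st_area(p1)), as.numeric(st_area(p2)))
})
summary(overlap_fractions)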

#select a random sample of these cases of two parcels that overlap
set.seed(55) 
sample <- sample(1:nrow(spatial_overlaps), 20)
parcel_selection <- spatial_overlaps[sample,]

#plot the elements of the random sample of pairs for which st_overlaps had indicated an overlap
for (index in 1:nrow(parcel_selection)) {
  row_1 <- parcel_sf %>% select(parcelnum) %>% 
    filter(parcelnum == parcel_selection$parcelnum.x[index])
  row_2 <- parcel_sf %>% select(parcelnum) %>% 
    filter(parcelnum == parcel_selection$parcelnum.y[index])
  print(ggplot(rbind(row_1, row_2)) + geom_sf(aes(fill = parcelnum)))
}

rm(duplicate_parcel_numbers_in_parcel_data, selection_vector, unique_parcels, row_1, row_2, spatial_repeats, index, sample, parcel_selection, spatial_overlaps)
#(note that eval is set to false, to avoid rerunning this code every time I implement "run all chunks")

#investigate a few of the cases by road map 
road_map <- function(sf_shape) {
  df <- tibble(longitude = st_coordinates(sf_shape)[,1],
               latitude = st_coordinates(sf_shape)[,2])
  
  #left/bottom/right/top for bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  detroit_gg <- get_map(location = bounding_box, 
                        maptype = "roadmap")
  ggmap(detroit_gg)
}

road_map(parcel_sf %>% filter(parcelnum == "13000116.003"))

#investigate a few of the cases by satellite map
satellite_map <- function(sf_shape) {
  df <- tibble(longitude = st_coordinates(sf_shape)[,1],
               latitude = st_coordinates(sf_shape)[,2])
  
  #left/bottom/right/top for bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  detroit_gg <- get_map(location = bounding_box, 
                        maptype = "satellite")
  ggmap(detroit_gg)
}

satellite_map(parcel_sf %>% filter(parcelnum == "13000116.003"))
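
Since road_map and satellite_map differ only in the maptype argument passed to get_map, the duplication could be removed with a single parameterized helper. A sketch:

#sketch: one map helper, parameterized by maptype
parcel_map <- function(sf_shape, maptype = c("roadmap", "satellite")) {
  maptype <- match.arg(maptype)
  df <- tibble(longitude = st_coordinates(sf_shape)[,1],
               latitude = st_coordinates(sf_shape)[,2])
  #left/bottom/right/top for the bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  ggmap(get_map(location = bounding_box, maptype = maptype))
}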

We begin the process of creating a list of buildings from the parcels data. One of the issues to consider is the number of buildings on a parcel. Should we restrict our analysis to those parcels that have only one building? In any case, two columns in parcel_sf bear on this question: building_s and num_buildi.

#exploration of the numbers of buildings on parcels
buildings_1 <- parcel_sf %>% filter(!is.na(building_s))
levels(buildings_1$building_s)
##  [1] "1/2 DUPLEX"      "2 STY COLONIAL"  "APARTMENT"      
##  [4] "APT FLAT GARDEN" "APT HIGH RISE"   "CARRIAGE HOUSE" 
##  [7] "DUPLEX"          "FOUR FAMILY"     "GARAGE/SHED"    
## [10] "INCOME BUNGALOW" "LARGE FLATS"     "LOFT APT WALKUP"
## [13] "MULTI DWELLING"  "ROW HOUSE"       "SINGLE FAMILY"  
## [16] "TWO FAMILY FLAT"
buildings_2 <- parcel_sf %>% filter(num_buildi > 0)

buildings_3 <- parcel_sf %>% filter(!is.na(building_s) & num_buildi > 0)

buildings_4 <- parcel_sf %>% filter(is.na(building_s) & num_buildi > 0)
nrow(buildings_4)
## [1] 18749
buildings_5 <- parcel_sf %>% filter(!is.na(building_s) & !num_buildi > 0)

#print, but first cut out the geometry column to avoid an error message.
as.data.frame(buildings_5) %>% select(-geometry)

buildings_5 contains zero rows, thus indicating that buildings_1 is a subset of buildings_2. On the other hand, buildings_4 (the set of parcels with, according to the data, at least one building but with the building type unspecified, i.e. NA in building_s) contains 18,749 rows.

buildings_6 <- parcel_sf %>% filter(num_buildi > 1)

ggplot(buildings_1) + geom_bar(aes(x = as.factor(num_buildi)))

ggplot(buildings_4) + geom_bar(aes(x = as.factor(num_buildi)))

ggplot(buildings_6) + geom_bar(aes(x = building_s))

related <- buildings_1 %>% filter(!is.na(related_pa))
set.seed(27)
sample <- sample_n(related, 20)
dfs <- list()
for (index in 1:nrow(sample)) {
  row_1 <- sample[index,]
  row_2 <- parcel_sf %>% filter(parcelnum == sample$related_pa[index])
  df <- rbind(row_1, row_2)
  print(ggplot(df) + geom_sf(aes(fill = parcelnum)))
  dfs[[index]] <- df
}

as.data.frame(dfs[[2]])
rm(buildings_1,buildings_2, buildings_3, buildings_4, buildings_5, buildings_6)

rm(row_1, row_2, sample, related, dfs, df)

I will ignore the “related parcels” (related_pa) in parcel_sf. Before making the final call as to what subset of the parcels dataframe will provide the stand-in for buildings, I will work on the data that we will use to form the labels.

Like the parcel_sf dataframe, the dismantle permits data also contains some duplicate parcel numbers over rows, in some cases over rows whose address information suggests that the locations are different. (It also indicates that a few individual locations, identified by address, had more than one associated dismantle permit. This need not be problematic: a permit could expire before the work is carried out, or there could be more than one structure on a parcel.) These repetitions are in dup_par_num_over_distinct_addresses, which contains 272 rows. I should also note that, in most of these cases with duplicate parcel numbers (and addresses indicating different locations), the recorded latitude and longitude are identical. We can thus infer that some of this location information is incorrect.
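
One way to see the identical-coordinates pattern directly is to compare, for each repeated parcel number, the number of distinct addresses with the number of distinct recorded positions. A sketch (st_coordinates pulls the recorded point out of each permit's geometry):

#sketch: parcel numbers with several addresses but only one recorded position
dismantle_permits_sf %>%
  mutate(lon = st_coordinates(.)[,1], lat = st_coordinates(.)[,2]) %>%
  as.data.frame() %>%
  group_by(`Parcel Number`) %>%
  summarize(addresses = n_distinct(`Site Address`),
            positions = n_distinct(lon, lat)) %>%
  filter(addresses > 1, positions == 1)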

#repeated parcel numbers in the dismantle permits data
dup_par_num_in_dismantle_data <-
  dismantle_permits_sf %>%
  group_by(`Parcel Number`) %>% 
  mutate(n = n()) %>%
  ungroup() %>% 
  filter(n > 1) %>%
  select(`Parcel Number`, `Site Address`) %>%
  arrange(`Parcel Number`)
rm(dup_par_num_in_dismantle_data)

#parcel numbers in the dismantle permits data that are distributed over distinct address strings
dup_par_num_over_distinct_addresses <- 
  dismantle_permits_sf %>%
  group_by(`Parcel Number`) %>%
  mutate(parcel_number_occurances = n()) %>%
  ungroup() %>%
  filter(parcel_number_occurances > 1) %>%
  group_by(`Parcel Number`, `Site Address`) %>%
  mutate(m = n()) %>%
  filter(m < parcel_number_occurances) %>%
  arrange(`Parcel Number`)
#remove extraneous variables and then geocode the dismantle site addresses
geocoded_duplicates <- dup_par_num_over_distinct_addresses %>%
  as.data.frame %>%
  select(-geometry) %>%
  mutate(location = map(str_c(`Site Address`, ", Detroit, Michigan"), geocode_pause))
  
#avoid having to do the geocoding again when re-running the code
write_rds(geocoded_duplicates, "./data/geocoded_duplicates.rds")  

Let’s have a look at the relationship between the numbers of buildings on the parcels, as indicated in parcel_sf, and the numbers of times parcels occur in dup_par_num_over_distinct_addresses (which was constructed from the dismantle permits data).

#read the saved geocoded file into R
geocoded_duplicates <- read_rds("./data/geocoded_duplicates.rds")

#unpack the column of dataframes created by the geocoding, remove rows with identical geocoding results, and convert the resulting dataframe into a set of sf objects
geocoded_duplicates <- remove_null_locations(geocoded_duplicates) %>% 
  unnest(location) %>% distinct(lon, lat, .keep_all = TRUE) %>%
  st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf))

#investigate the relationship between the number of buildings on a parcel, as specified in parcel_sf, and the number of occurrences of parcel numbers in geocoded_duplicates
joined_w_parcels_sf <- as.data.frame(geocoded_duplicates) %>% 
  group_by(`Parcel Number`) %>% 
  summarize(`number of locations` = n()) %>%
  inner_join(as.data.frame(parcel_sf), by = c("Parcel Number" = "parcelnum")) 
ggplot(joined_w_parcels_sf) +
  geom_count(aes(x = as.factor(`number of locations`),
                 y = num_buildi))

#cut out the greater values of num_buildi, so as to widen the vertical scale
ggplot(joined_w_parcels_sf %>% filter(num_buildi < 11)) +
  geom_count(aes(x = as.factor(`number of locations`),
                 y = as.factor(num_buildi)))

Note the large number of cases for which num_buildi is equal to zero. If the data is accurate, then it is a reasonable guess that such cases reflect parcels for which num_buildi has been updated after all buildings have been removed.
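
A quick way to see how common the zero-building case is here (a sketch, using the joined_w_parcels_sf data frame from the chunk above):

joined_w_parcels_sf %>% count(num_buildi == 0)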

temp3 <- geocoded_duplicates %>% #select(`Parcel Number`, location, `Permit Location`) %>%
  filter(`Parcel Number` == "13006809.")

#function for putting a set of sf points on a satellite map
sf_points_map <- function(sf_df) {
  df <- sf_df %>% mutate(longitude = st_coordinates(sf_df)[,1],
                                latitude = st_coordinates(sf_df)[,2]) %>%
  as.data.frame %>% select(longitude, latitude)
  
  #left/bottom/right/top for bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  detroit_gg <- get_map(location = bounding_box, 
                        maptype = "satellite")
  ggmap(detroit_gg) + geom_point(data = df, aes(x = longitude, y = latitude),
                                 color = "red") 
}

#map of points with Parcel Number listed as "13006809."
sf_points_map(temp3)
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.410747,-83.043346&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN

#note that "13006809." is not reflected as a parcel number in parcel_sf (noting that parcel_sf$parcelnum is an integer vector).
nrow(parcel_sf %>% filter(parcelnum == "13006809."))
## [1] 0
#spatial join to see where these points in the dismantled dataset hook up with the parcels dataset, with the result that the parcel contains three of the seven points with `Parcel Number` = "13006809.". The others are nearby, although some may be closer to other parcels.
temp4 <- st_join(parcel_sf, temp3, left = FALSE, st_contains)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
temp4$`Parcel Number`
## [1] "13006809." "13006809." "13006809."
as.numeric(as.character(temp4$parcelnum))
## [1] 13006809 13006809 13006809
temp5 <- parcel_sf %>%
  filter((!as.numeric(as.character(parcelnum)) < 13006809) &
          as.numeric(as.character(parcelnum)) < 13006810) %>%
  select(parcelnum)
as.data.frame(temp5$parcelnum)
#plot the points together with parcel "13006809." on a satellite map
sf_points_map(temp3) +  
  geom_sf(data = temp5 %>% select(geometry), crs = 3857, inherit.aes = FALSE)
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.410747,-83.043346&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
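
sf::st_distance can make the remark below concrete: it returns a matrix of point-to-polygon distances (in metres, for a geographic CRS). A sketch, to be run before the rm() call below removes temp3 and temp5:

#sketch: distances from each permit point (rows) to each candidate parcel (columns)
st_distance(temp3, temp5)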

rm(temp3, temp4, temp5, joined_w_parcels_sf, geocoded_duplicates, temp)

It is notable that one of the positions associated with the dismantle permits is far enough from the two parcels to be potentially closer to another parcel, which (upon further investigation with Google Maps) appears to be adjacent to a vacant lot, on which a building, now dismantled, may have sat.

Further investigation of Parcel Number duplicates in the dismantle permits data:

#read in the saved set of examples in the dismantle permits data that involve repeats of parcel numbers over different address strings.
dismantle_duplicates_geocoded <- read_rds("./data/geocoded_duplicates.rds")

#unpack the column of data frames (the outputs of geocode())
dismantle_duplicates_geocoded <- remove_null_locations(dismantle_duplicates_geocoded) %>%
   unnest(location)

#note the 16 rows of dismantle_duplicates_geocoded for which another row lists the same longitude and latitude
dismantle_duplicates_geocoded %>% 
  group_by(lon, lat) %>% mutate(n = n()) %>% filter(n > 1)
#keep only the first of any group of rows with the same longitude and latitude
dismantle_duplicates_geocoded <-
  dismantle_duplicates_geocoded %>% distinct(lon, lat, .keep_all = TRUE)

head(dismantle_duplicates_geocoded)
#manually cut out the one remaining apparent duplicate location
dismantle_duplicates_geocoded <- dismantle_duplicates_geocoded %>% 
  filter(!(`Site Address` == "3200 E LAFAYETTE-MARTIN LUTHER KING HIGH"))

dismantle_duplicates_geocoded <- dismantle_duplicates_geocoded %>%
  st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf)) %>%
  select(-parcel_number_occurances, -m)

#where possible, find the parcels that contain the positions corresponding to the coordinates
spatial_join_within <- 
  st_join(dismantle_duplicates_geocoded, 
          parcel_sf %>% select(parcelnum), 
          st_within)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#change the parcel numbers (in the dismantle permits data) to those of the parcels that contain the points specified by the coordinates, and remove the parcelnum column (from the parcel_sf data frame)
dismantle_duplicates_geocoded <- spatial_join_within %>%
  mutate(`Parcel Number` = ifelse(!is.na(parcelnum),
                                  as.character(parcelnum),
                                  `Parcel Number`)) %>%
  select(-parcelnum)

#replace rows with duplicates of parcel numbers over distinct addresses with the cleaned set of rows, as per above
dismantle_permits_sf <- dismantle_permits_sf %>% 
  filter(!(`Parcel Number` %in% dup_par_num_over_distinct_addresses$`Parcel Number`)) %>%
  rbind(dismantle_duplicates_geocoded)

rm(dismantle_duplicates_geocoded, dup_par_num_over_distinct_addresses, spatial_join_within)
rm(index)

Using Google Street View to investigate some more dismantle permit entries for which there are other entries with the same parcel number but different addresses, we find mostly different locations at the same building (e.g. the two sides of a duplex), or perhaps two attached buildings.

Continuing with the matter of assigning labels to the buildings, we now look at the completed demolitions data, completed_demolitions_sf. Although the initial idea was to use the dismantle permits data to assign the labels (blighted, i.e. to be dismantled, and not blighted, i.e. not to be dismantled), we should consider using the completed demolitions data instead, or in addition. Although this data may omit blighted buildings which, for whatever reason, have not been demolished or which, as suggested on the City-of-Detroit web page from which the file was downloaded, may have been demolished in a hurry, the data appears to be relatively clean and genuinely reflective of blightedness. For example, if a new property owner dismantled a well-kept building for some reason, say to build a larger home, then, it may seem, the demolition of this building would be reflected in the dismantle-permits dataset but not in the completed-demolitions dataset.

#look for repeats of "Parcel ID" on completed_demolitions_sf, finding none
as.data.frame(completed_demolitions_sf) %>%
  select(-geometry) %>%
  group_by(`Parcel ID`) %>%
  mutate(n = n()) %>%
  filter(n > 1)
#check to see how this data joins with the parcels data, noting that the inner join contains exactly seven fewer rows than completed_demolitions_sf
demolition_join_on_parcels <- as.data.frame(parcel_sf) %>% filter(!is.na(parcelnum)) %>%
  inner_join(as.data.frame(completed_demolitions_sf %>% filter(!is.na(`Parcel ID`))),
                                         by = c("parcelnum" = "Parcel ID"))

#look for repeats of parcelnum in demolition_join_on_parcels, finding none. We thus find near-total agreement between the parcel_sf data frame and the completed demolitions data frame: given the information in parcel_sf, the parcel numbers in the completed demolitions dataset appear to be consistent with the location data.
as.data.frame(demolition_join_on_parcels) %>%
  select(-geometry.x, -geometry.y) %>%
  group_by(parcelnum) %>%
  mutate(n = n()) %>%
  filter(n > 1)
#have a look at the seven rows of completed_demolitions for which the `Parcel ID` did not match with a value of parcelnum in parcel_sf
dismantled_parcels_left_out <- completed_demolitions_sf %>% 
  filter(!`Parcel ID` %in% demolition_join_on_parcels$parcelnum)

#link these to the parcels data set, parcel_sf, using a spatial join
dismantled_left_out_joined <- dismantled_parcels_left_out %>%
  st_join(parcel_sf %>% select(geometry), st_covered_by, left = FALSE)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
rm(demolition_join_on_parcels, dismantled_left_out_joined, dismantled_parcels_left_out)

In an effort to make our predictors properly relevant to what they are predicting, we will restrict our labels to the period from June of 2016 onward. We thus need to convert some of the date information, which is in the form of a string, into a proper date format. Although three datasets are being used to construct our labels, the conversion will be necessary for only two of them.

require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
completed_demolitions_sf$`Demolition Date` <- mdy(completed_demolitions_sf$`Demolition Date`)

dismantle_permits_sf$`Permit Issued` <- mdy(dismantle_permits_sf$`Permit Issued`)

Our parcel list, the stand-ins for buildings, will be formed from those parcels within the Hardest Hit Fund areas that satisfy one or more of the following conditions: having at least one building on the parcel according to num_buildi; having an associated completed demolition or dismantle permit dated after May of 2016; or having an associated upcoming demolition.

Having determined that the great majority of the parcels have only one building, we may guess that the existence of a few parcels with repeats won’t have much of a distorting effect.

After construction of the list of parcels, we proceed to a set of labels (blighted or not blighted) for each parcel, based on the building’s either having been dismantled or having had a dismantle permit associated with it. In associating dismantle permits and actual demolitions with individual parcels, I have prioritized spatial association (using sf::st_join) over matching of parcel numbers. (That is, in the limited number of cases where these methods of association disagree, we use the spatial association.)
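
Since this spatial-first, parcel-number-second association is repeated for several datasets below, the pattern can be summarized in a small helper. This is a sketch of the idea only; the chunks that follow spell each case out explicitly:

#sketch: associate event points with parcels, preferring spatial containment and
#falling back to a parcel-number join for the rows the spatial join misses
associate_with_parcels <- function(parcels, events, parcel_col) {
  events <- events %>% mutate(rownumber = row_number())
  spatial <- parcels %>% st_join(events, st_contains, left = FALSE)
  leftover <- events %>% filter(! rownumber %in% spatial$rownumber)
  by_number <- parcels %>%
    inner_join(as.data.frame(leftover), by = setNames(parcel_col, "parcelnum"))
  list(spatial = spatial, by_number = by_number)
}

For example, the completed-demolitions case below corresponds (apart from the date filter) to associate_with_parcels(parcels_in_hardest_hit_areas, completed_demolitions_sf, "Parcel ID").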

#the set of all rows of parcel_sf that are within one of the hardest hit areas
parcels_in_hardest_hit_areas <- parcel_sf %>%
  st_join(Hardest_Hit_Fund_Areas, st_within, left = FALSE)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#add a row number to completed_demolitions_sf, so as to keep an account of which rows have been included in the following joins
completed_demolitions_sf <- completed_demolitions_sf %>% mutate(rownumber = row_number())

#inner spatial join of parcels_in_hardest_hit_areas and completed_demolitions_sf, on the basis of containment, in the parcel, of the point associated with the demolition 
completed_in_fund_areas_parcel_spatial <- parcels_in_hardest_hit_areas %>%
  st_join(completed_demolitions_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
completed_in_fund_areas_parcel_spatial <- completed_in_fund_areas_parcel_spatial %>%
  filter(`Demolition Date` > mdy("05/31/2016"))

#inner join of parcels_in_hardest_hit_areas and completed_demolitions_sf, on the basis of identity of parcel numbers, for those elements of completed_demolitions_sf that were not captured in the spatial join
completed_in_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
  inner_join(as.data.frame(completed_demolitions_sf %>% 
                             filter(! rownumber %in% completed_in_fund_areas_parcel_spatial$rownumber)), 
             by = c("parcelnum" = "Parcel ID"))

completed_in_fund_areas_parcelnum_join <- completed_in_fund_areas_parcelnum_join %>%
  filter(`Demolition Date` > mdy("05/31/2016"))

#add a row number to upcoming_demolitions_sf, so as to keep an account of which rows have been included in the following joins
upcoming_demolitions_sf <- upcoming_demolitions_sf %>% mutate(rownumber = row_number())

#inner spatial join of parcels_in_hardest_hit_areas and upcoming_demolitions_sf, on the basis of containment, in the parcel, of the point associated with the demolition 
upcoming_in_fund_areas_parcel_spatial <- parcels_in_hardest_hit_areas %>%
  st_join(upcoming_demolitions_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
#inner join of parcels_in_hardest_hit_areas and upcoming_demolitions_sf, on the basis of identity of parcel numbers, for those elements of upcoming_demolitions_sf that were not captured in the spatial join
upcoming_in_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
  inner_join(as.data.frame(upcoming_demolitions_sf %>% 
                             filter(! rownumber %in% upcoming_in_fund_areas_parcel_spatial$rownumber)), 
             by = c("parcelnum" = "Parcel ID"))

#repeat the above three steps for the parcels that have dismantle permits associated with them
dismantle_permits_sf <- dismantle_permits_sf %>% mutate(rownumber = row_number())
  
dismantle_permit_fund_areas_spatial <- parcels_in_hardest_hit_areas %>%
  st_join(dismantle_permits_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
dismantle_permit_fund_areas_spatial <- dismantle_permit_fund_areas_spatial %>%
  filter(`Permit Issued` > mdy("05/31/2016"))

dismantle_permit_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
  inner_join(as.data.frame(dismantle_permits_sf %>%
                             filter(! rownumber %in% dismantle_permit_fund_areas_spatial$rownumber)), 
             by = c("parcelnum" = "Parcel Number"))

dismantle_permit_fund_areas_parcelnum_join <- dismantle_permit_fund_areas_parcelnum_join %>%
  filter(`Permit Issued` > mdy("05/31/2016"))

#data frame with the parcel numbers of all parcels that have at least one dismantled building, upcoming demolition, or dismantle permit associated with them
blighted_parcels <- rbind(as.data.frame(completed_in_fund_areas_parcel_spatial) %>% select(parcelnum), 
                          as.data.frame(completed_in_fund_areas_parcelnum_join) %>% select(parcelnum),
                          as.data.frame(dismantle_permit_fund_areas_spatial) %>% select(parcelnum),
                          as.data.frame(dismantle_permit_fund_areas_parcelnum_join) %>% select(parcelnum),
                          as.data.frame(upcoming_in_fund_areas_parcel_spatial) %>% select(parcelnum), 
                          as.data.frame(upcoming_in_fund_areas_parcelnum_join) %>% select(parcelnum)) %>%
  distinct(parcelnum)

#data frame with the parcel numbers for all parcels in the hardest hit areas that have or have had at least one building on the parcel
parcels_set <- as.data.frame(blighted_parcels) %>% 
  select(parcelnum) %>%
  rbind(as.data.frame(parcels_in_hardest_hit_areas) %>% 
          filter(num_buildi > 0) %>% select(parcelnum)) %>%
  distinct(parcelnum)

#our labels
labels <- parcels_set %>% mutate(blighted = ifelse(parcelnum %in% blighted_parcels$parcelnum, 1, 0))

#note the balance of blighted versus non-blighted labels.
labels %>% count(blighted) %>% mutate(fraction = n / sum(n))
rm(completed_in_fund_areas_parcel_spatial, completed_in_fund_areas_parcelnum_join, dismantle_permit_fund_areas_spatial, dismantle_permit_fund_areas_parcelnum_join, parcels_set, blighted_parcels, parcels_in_hardest_hit_areas, upcoming_in_fund_areas_parcel_spatial, upcoming_in_fund_areas_parcelnum_join)

We now construct a balanced set of labels, with roughly equal numbers of blighted and non-blighted examples. We then divide the set of parcel numbers into training and test sets, on a ratio of 0.85 to 0.15. Note that we will be using 10-fold cross validation on the training set, with the 15 percent of all of the items set aside for a final test of our model.

labels_balanced <- labels %>% filter(blighted == 0) %>% sample_n(2200) %>%
  rbind(labels %>% filter(blighted == 1))

set.seed(82)
training_rows <- caret::createDataPartition(labels_balanced$blighted, p = 0.85, list = FALSE)
training_parcelnums <- labels_balanced[training_rows,]$parcelnum
testing_parcelnums <- labels_balanced[-training_rows,]$parcelnum
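
As a sanity check, we can confirm that the partition roughly preserves the class balance (a sketch):

#tabulate the blighted counts within the training and testing portions
labels_balanced %>%
  mutate(split = ifelse(parcelnum %in% training_parcelnums, "train", "test")) %>%
  count(split, blighted)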

Having explored and cleaned the data and constructed a set of labels, we proceed to the construction of predictive models. The first step will be to add some properties to the set of labels.

#add the parcel geometry to the labels
parcels_with_labels <- parcel_sf %>% select(parcelnum) %>%
  inner_join(labels_balanced, by = "parcelnum")

#check to see that there are no parcel number repeats (finding none)
parcels_with_labels %>% as.data.frame() %>% select(-geometry) %>% count(parcelnum) %>% filter(n > 1)
#note that `Ticket ID` provides a key for blight_violations_sf (temp has zero rows)
temp <- blight_violations_sf %>% as.data.frame() %>% select(-geometry) %>%
  count(`Ticket ID`) %>% filter(n > 1)
temp
rm(temp)

head(blight_violations_sf) 
#have a look at the distribution of violators recorded in blight_violations_sf (as indicated by `Violator ID`), noting that the following construction contains zero rows (Violator ID thus doesn't seem to be useful)
violator_distribution <- blight_violations_sf %>% 
  as.data.frame() %>% select(-geometry) %>% count(`Violator ID`) %>% filter(n > 1)
violator_distribution
rm(violator_distribution)

#associate the blight violations data with parcels_with_labels, first with a spatial join and then, for any rows in blight_violations_sf for which the spatial association fails, with an inner join on the parcel numbers. Recall that parcels_with_labels is restricted to the hardest hit areas
spatial_join <- parcels_with_labels %>% 
  st_join(blight_violations_sf, st_contains, left = FALSE) %>%
  select(-`Violation Parcel ID`)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
#make use of the key `Ticket ID` to find the leftovers
blight_violations_leftovers <- blight_violations_sf %>% filter(! `Ticket ID` %in% spatial_join$`Ticket ID`)

#now handle the leftovers
parcelnum_join <- parcels_with_labels %>% 
  inner_join((as.data.frame(blight_violations_leftovers) %>% select(-geometry)),
             by = c("parcelnum" = "Violation Parcel ID"))

blight_violation_data <- rbind(spatial_join, parcelnum_join)

blight_violation_data <- inner_join(parcels_with_labels, 
                                     as.data.frame(blight_violation_data) %>% select(-geometry, -blighted), 
                                     by = "parcelnum")

rm(spatial_join, blight_violations_leftovers, parcelnum_join)

We consider a histogram of the blight violation data.

#dataset with two calculated variables, the total number of blight violations and the total fines by parcel, perhaps to be used as predictors
blight_violation_tallies <- blight_violation_data %>% as.data.frame() %>% select(-geometry) %>%
  mutate(`Fine Amount` = as.numeric(str_sub(`Fine Amount`, start = 2))) %>%
  group_by(parcelnum, blighted) %>%
  summarize(number_of_violations_by_parcel = n(), total_fines_by_parcel = sum(`Fine Amount`))

blight_violation_tallies <- as.tibble(blight_violation_tallies)

parcels_without_blight_violations <- labels_balanced %>% 
  filter(!parcelnum %in% blight_violation_tallies$parcelnum) %>%
  mutate(number_of_violations_by_parcel = 0, total_fines_by_parcel = 0)

blight_violation_tallies <- bind_rows(blight_violation_tallies, parcels_without_blight_violations)

blight_violation_summary <- blight_violation_tallies %>% 
  filter(number_of_violations_by_parcel < 19) %>%
  group_by(number_of_violations_by_parcel, blighted) %>% summarize(n = n()) 

ggplot(blight_violation_summary) + 
  geom_col(aes(x = number_of_violations_by_parcel,
               y = n, 
               fill = as.factor(blighted)),
           position = "dodge2")

rm(blight_violation_summary)  

Now consider a histogram of the fine totals.

head(blight_violation_tallies)
blight_violation_tallies %>% filter(total_fines_by_parcel < 5000) %>% 
  ggplot(aes(x = total_fines_by_parcel, fill = as.factor(blighted))) + 
  geom_histogram(binwidth = 200)

We will next construct a simple logistic model, using the constructed dataframe that we used for the two plots above. The two predictors will be the number of blight violations and the total amount in fines.

#separate the examples to be used for training from the fifteen percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- blight_violation_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

models <- 1:10 %>% map(~ glm(blighted ~ total_fines_by_parcel + number_of_violations_by_parcel,
                             data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.590493
sd(as.numeric(accuracies))
## [1] 0.03531362
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 144 105
##   TRUE   47  68
##        truth
## pred      0   1
##   FALSE 145 119
##   TRUE   32  68
##        truth
## pred      0   1
##   FALSE 156 112
##   TRUE   32  65
##        truth
## pred      0   1
##   FALSE 162 118
##   TRUE   20  64
##        truth
## pred      0   1
##   FALSE 154 129
##   TRUE   31  50
##        truth
## pred      0   1
##   FALSE 145 117
##   TRUE   46  56
##        truth
## pred      0   1
##   FALSE 170 114
##   TRUE   20  60
##        truth
## pred      0   1
##   FALSE 166 104
##   TRUE   28  66
##        truth
## pred      0   1
##   FALSE 139 133
##   TRUE   38  54
##        truth
## pred      0   1
##   FALSE 150 110
##   TRUE   36  68
#construct a grid of predictor values and then represent the predictions with colors
library(modelr)
number_of_violations_by_parcel <- 
  seq_range(train$number_of_violations_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
total_fines_by_parcel <- seq_range(train$total_fines_by_parcel, n = 10, pretty = TRUE, trim = 0.05)

grid <- crossing(number_of_violations_by_parcel,
                 total_fines_by_parcel)

#have a look at the third model (among the k-fold validation models)
grid <- grid %>% add_predictions(models[[3]])

ggplot(grid) + geom_point(aes(x = number_of_violations_by_parcel, 
                              y = total_fines_by_parcel, color = pred > 0.5))

Note that the models are much better at predicting negative instances (truth = 0) than positive instances (truth = 1); they get most of the positive instances wrong.
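
To put numbers on this imbalance, we can pool the out-of-fold predictions across the ten folds and compute the proportion correct within each class (a sketch, assuming predictions, k_folds, and train from the chunk above are still in memory):

#pool the out-of-fold predictions and the corresponding labels
pooled <- tibble(
  pred = unlist(map(1:10, ~ as.numeric(! predictions[[.x]] < 0.5))),
  truth = unlist(map(1:10, ~ train[k_folds[[.x]],]$blighted))
)
pooled %>% group_by(truth) %>% summarize(proportion_correct = mean(pred == truth))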

We now compare this with a simple rpart (decision tree) model.

library(rpart)

#separate the examples to be used for training from the fifteen percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- blight_violation_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

models <- 1:10 %>% map(~ rpart(blighted ~ total_fines_by_parcel + number_of_violations_by_parcel,
                             data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6028534
sd(as.numeric(accuracies))
## [1] 0.02033948
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 143 101
##   TRUE   43  77
##        truth
## pred      0   1
##   FALSE 137  93
##   TRUE   49  85
##        truth
## pred      0   1
##   FALSE 118  72
##   TRUE   69 106
##        truth
## pred      0   1
##   FALSE 122  74
##   TRUE   64 104
##        truth
## pred      0   1
##   FALSE 117  67
##   TRUE   69 111
##        truth
## pred      0   1
##   FALSE 128  94
##   TRUE   58  84
##        truth
## pred      0   1
##   FALSE 115  77
##   TRUE   71 101
##        truth
## pred      0   1
##   FALSE 114  66
##   TRUE   72 112
##        truth
## pred      0   1
##   FALSE 142 116
##   TRUE   44  62
##        truth
## pred      0   1
##   FALSE 113  74
##   TRUE   73 104
#construct a grid of predictor values and then represent the predictions with colors
library(modelr)
number_of_violations_by_parcel <- 
  seq_range(train$number_of_violations_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
total_fines_by_parcel <- seq_range(train$total_fines_by_parcel, n = 10, pretty = TRUE, trim = 0.05)

grid <- crossing(number_of_violations_by_parcel,
                 total_fines_by_parcel)

#have a look at the ninth model (among the k-fold validation models)
grid <- grid %>% add_predictions(models[[9]])

#plot of the predictions for one of the models
ggplot(grid) + geom_point(aes(x = number_of_violations_by_parcel, 
                              y = total_fines_by_parcel, color = pred[,2] > 0.5))

models[[4]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3277 1602 0 (0.5111382 0.4888618)  
##   2) total_fines_by_parcel< 35 1730  659 0 (0.6190751 0.3809249) *
##   3) total_fines_by_parcel>=35 1547  604 1 (0.3904331 0.6095669) *
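
The printed tree can also be drawn, which becomes easier to read as the trees grow; a minimal sketch, assuming the rpart.plot package is installed:

library(rpart.plot)
#draw the fourth fold's tree, with class probabilities at the leaves
rpart.plot(models[[4]])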

Note that the simple decision-tree model is somewhat better than the logistic model at teasing out the positive instances, and somewhat less effective with respect to the negative instances.
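
The fold_recall sketch from above makes the trade-off concrete, now applied to the rpart fold predictions:

fold_recall(predictions, k_folds, train, 0)  #recall on the negatives
fold_recall(predictions, k_folds, train, 1)  #recall on the positives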

We now add the calls in the improve_detroit_issues (311) dataset to the model. Note that the improve detroit issues dataset does not contain data for 2013 or earlier.
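
As a quick check on that claim, something like the following works (a sketch: the column name `Created At` and its month/day/year format are assumptions about this file, not verified here):

library(lubridate)
#earliest recorded issue in the 311 data
improve_detroit_issues %>%
  mutate(created = mdy(str_sub(`Created At`, 1, 10))) %>%
  summarize(earliest = min(created, na.rm = TRUE))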

#note the possible issue types, and then restrict the set to include issues that would seem to be associated with blight in a particular building
levels(as.factor(improve_detroit_issues$`Issue Type`))
##  [1] "Blocked Catch Basin"                          
##  [2] "Cemetery Issue"                               
##  [3] "Clogged Drain"                                
##  [4] "Curbside Solid Waste Issue"                   
##  [5] "Dead Animal Removal"                          
##  [6] "DPW - Debris Removal - DPW USE ONLY"          
##  [7] "DPW - Other environmental - DPW USE ONLY"     
##  [8] "Fire Hydrant Issue"                           
##  [9] "Illegal Dump Sites"                           
## [10] "Manhole Cover Issue"                          
## [11] "New LED Street Light Out"                     
## [12] "Other - Not within City jurisdiction"         
## [13] "Other - Not within scope of City services"    
## [14] "Other - Referred to other City Department"    
## [15] "Park Issue"                                   
## [16] "Potholes"                                     
## [17] "Residential Snow Removal Issue"               
## [18] "Rodent Extermination - BSEED Only"            
## [19] "Running Water in a Home or Building"          
## [20] "Squatters Issue"                              
## [21] "Squatters Issue - TEST ONLY"                  
## [22] "Street Light / Street Light Pole Major Repair"
## [23] "Street Light Out"                             
## [24] "Street Light Pole Down"                       
## [25] "Traffic Sign Issue"                           
## [26] "Traffic Signal Issue"                         
## [27] "Tree Issue"                                   
## [28] "Water Main Break"
#we'll work with the following subset, which keeps the issue types that plausibly indicate blight at a particular building
improve_detroit_issues_property <- improve_detroit_issues %>%
  filter(`Issue Type` %in% c("Residential Snow Removal Issue", "Running Water in a Home or Building", 
                             "Curbside Solid Waste Issue",  "DPW - Debris Removal - DPW USE ONLY", 
                             "Rodent Extermination - BSEED Only", "Squatters Issue"))

#add the parcel geometry to the dataset with the blight violation tallies
blight_violation_tallies <- parcels_with_labels %>% select(geometry, parcelnum) %>% 
  inner_join(blight_violation_tallies, by = "parcelnum")

improve_detroit_issues_tallies <- improve_detroit_issues_property %>% 
  st_join(blight_violation_tallies, join = st_within) %>% 
  as_tibble() %>% select(-geometry) %>% count(parcelnum)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#note the distribution of the tally numbers
improve_detroit_issues_tallies %>% ggplot() +
  geom_bar(aes(x = as.factor(n)))

#join the 311 issue tallies onto the blight violation tallies, recoding missing tallies as zero
improve_detroit_and_violations_tallies <- blight_violation_tallies %>% as.data.frame() %>% 
  select(-geometry) %>% left_join(improve_detroit_issues_tallies, by = "parcelnum") %>%
  mutate(improve_issues_tallies = ifelse(is.na(n), 0, n)) %>% select(-n)

We add this data to our models, and again try both logistic-regression and decision-tree methods.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- improve_detroit_and_violations_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + improve_issues_tallies

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5902281
sd(as.numeric(accuracies))
## [1] 0.03057918
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 153 114
##   TRUE   26  71
##        truth
## pred      0   1
##   FALSE 157  99
##   TRUE   42  66
##        truth
## pred      0   1
##   FALSE 148 124
##   TRUE   33  60
##        truth
## pred      0   1
##   FALSE 159 104
##   TRUE   30  71
##        truth
## pred      0   1
##   FALSE 134 132
##   TRUE   30  68
##        truth
## pred      0   1
##   FALSE 165 110
##   TRUE   36  53
##        truth
## pred      0   1
##   FALSE 147 117
##   TRUE   39  61
##        truth
## pred      0   1
##   FALSE 161 108
##   TRUE   36  59
##        truth
## pred      0   1
##   FALSE 143 131
##   TRUE   38  52
##        truth
## pred      0   1
##   FALSE 152 111
##   TRUE   32  69
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- improve_detroit_and_violations_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as in the logistic model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6089003
sd(as.numeric(accuracies))
## [1] 0.02508683
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 117  67
##   TRUE   69 111
##        truth
## pred      0   1
##   FALSE 130  77
##   TRUE   56 101
##        truth
## pred      0   1
##   FALSE 139  97
##   TRUE   48  81
##        truth
## pred      0   1
##   FALSE 124  72
##   TRUE   62 106
##        truth
## pred      0   1
##   FALSE 127  72
##   TRUE   59 106
##        truth
## pred      0   1
##   FALSE 142  97
##   TRUE   44  81
##        truth
## pred      0   1
##   FALSE 104  77
##   TRUE   82 101
##        truth
## pred      0   1
##   FALSE 118  76
##   TRUE   68 102
##        truth
## pred      0   1
##   FALSE 119  85
##   TRUE   67  93
##        truth
## pred      0   1
##   FALSE 119  82
##   TRUE   67  96

We now try to work with the crime data.

library(units)
## 
## Attaching package: 'units'
## The following object is masked from 'package:base':
## 
##     %*%
library(lubridate)

#improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>%
#  as.data.frame() %>% select(-geometry)

#add the parcel geometry to the data we are using for modeling
improve_detroit_and_violations_tallies <- parcels_with_labels %>% select(-blighted) %>%
  left_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>% st_sf()

improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>% st_sf()

distance <- set_units(200, "m")  #for crimes within 200 meters

#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_later_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

#ggplot(improve_detroit_and_violations_tallies[100,]) + geom_sf() 

tallies_with_later_crime <- tallies_with_later_crime %>%
  st_join(crime_12062016_to_03192018 %>% st_transform(2253),
          join = st_contains)

#count the number of later recorded crimes within 200 meters, noting that every parcel has at least one
tallies_with_later_crime <- tallies_with_later_crime %>% as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(later_recorded_crime_incidents = n())

#now tally the earlier crime incidents, by roughly repeating the last few steps

#add the parcel geometry to the data we are using for modeling
#tallies_with_earlier_crime <- parcels_with_labels %>% select(-blighted) %>%
#  left_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>% st_sf()

#improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>% st_sf()

earlier_crime_sf <- crime_to_12062016_sf %>% 
  mutate(INCIDENTDATE = mdy(str_sub(INCIDENTDATE, 1, 8))) %>%
  filter(INCIDENTDATE > mdy("12/31/2014")) %>%
  mutate(general_type = case_when(
  CATEGORY %in% c("AGGRAVATED ASSAULT", "ROBBERY", "ASSAULT", "HOMICIDE") ~ "Violent Crime",
  CATEGORY %in% c("BURGLARY", "STOLEN PROPERTY", "STOLEN VEHICLE", "DAMAGE TO PROPERTY", 
                  "OTHER BURGLARY", "LARCENY") ~ "Property Crime",
  CATEGORY %in% c("SEX OFFENSES", "SOLICITATION") ~ "Sex Offences",
  CATEGORY %in% c("VAGRANCY (OTHER)") ~ "Vagrancy"))
  
#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_property_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

tallies_with_earlier_property_crime <- tallies_with_earlier_property_crime %>%
  st_join(earlier_crime_sf %>% filter(general_type == "Property Crime") %>% st_transform(2253),
          join = st_contains)

#count the number of earlier recorded property crimes within 200 meters, noting that every parcel has at least one
tallies_with_earlier_property_crime <- tallies_with_earlier_property_crime %>% as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(earlier_property_crime_incidents = n())



#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_violent_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

tallies_with_earlier_violent_crime <- tallies_with_earlier_violent_crime %>%
  st_join(earlier_crime_sf %>% filter(general_type == "Violent Crime") %>% st_transform(2253),
          join = st_contains)

#count the number of earlier recorded violent crimes within 200 meters, noting that every parcel has at least one
tallies_with_earlier_violent_crime <- tallies_with_earlier_violent_crime %>% 
  as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(earlier_violent_crime_incidents = n())

tallies_with_crime <- tallies_with_later_crime %>%
  inner_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>%
  inner_join(tallies_with_earlier_property_crime, by = "parcelnum") %>%
  inner_join(tallies_with_earlier_violent_crime, by = "parcelnum")

rm(tallies_with_earlier_property_crime, tallies_with_later_crime, tallies_with_earlier_violent_crime)
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#the formula now adds the crime tallies as predictors
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6498254
sd(as.numeric(accuracies))
## [1] 0.02427364
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 125  71
##   TRUE   58 110
##        truth
## pred      0   1
##   FALSE 122  55
##   TRUE   59 128
##        truth
## pred      0   1
##   FALSE 134  69
##   TRUE   64  98
##        truth
## pred      0   1
##   FALSE 132  59
##   TRUE   60 113
##        truth
## pred      0   1
##   FALSE 133  66
##   TRUE   53 112
##        truth
## pred      0   1
##   FALSE 110  72
##   TRUE   67 115
##        truth
## pred      0   1
##   FALSE 123  64
##   TRUE   77 100
##        truth
## pred      0   1
##   FALSE 128  62
##   TRUE   61 113
##        truth
## pred      0   1
##   FALSE 115  84
##   TRUE   47 118
##        truth
## pred      0   1
##   FALSE 123  57
##   TRUE   70 114
models[4]
## [[1]]
## 
## Call:  glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]], 
##     ])
## 
## Coefficients:
##                      (Intercept)             total_fines_by_parcel  
##                        4.678e-01                         8.229e-05  
##   number_of_violations_by_parcel            improve_issues_tallies  
##                        9.462e-02                         8.764e-02  
## earlier_property_crime_incidents   earlier_violent_crime_incidents  
##                       -1.176e-02                         1.360e-02  
## 
## Degrees of Freedom: 3276 Total (i.e. Null);  3271 Residual
## Null Deviance:       4542 
## Residual Deviance: 4201  AIC: 4213
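
On the log-odds scale these coefficients are hard to interpret directly; exponentiating gives odds ratios per unit increase in each predictor (a sketch):

#odds ratios for the fourth fold's model
exp(coef(models[[4]]))
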
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as in the logistic model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.643232
sd(as.numeric(accuracies))
## [1] 0.01506849
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 112  52
##   TRUE   74 126
##        truth
## pred      0   1
##   FALSE 111  58
##   TRUE   75 120
##        truth
## pred      0   1
##   FALSE 128  74
##   TRUE   59 104
##        truth
## pred      0   1
##   FALSE 119  66
##   TRUE   67 112
##        truth
## pred      0   1
##   FALSE 127  62
##   TRUE   59 116
##        truth
## pred      0   1
##   FALSE  98  46
##   TRUE   88 132
##        truth
## pred      0   1
##   FALSE 112  49
##   TRUE   74 129
##        truth
## pred      0   1
##   FALSE 123  75
##   TRUE   63 103
##        truth
## pred      0   1
##   FALSE 130  70
##   TRUE   56 108
##        truth
## pred      0   1
##   FALSE 127  73
##   TRUE   59 105
models[[10]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 25 1728  654 0 (0.6215278 0.3784722)  
##      4) earlier_property_crime_incidents>=154.5 1341  445 0 (0.6681581 0.3318419) *
##      5) earlier_property_crime_incidents< 154.5 387  178 1 (0.4599483 0.5400517)  
##       10) earlier_violent_crime_incidents< 70.5 177   76 0 (0.5706215 0.4293785)  
##         20) earlier_property_crime_incidents>=106.5 68   11 0 (0.8382353 0.1617647) *
##         21) earlier_property_crime_incidents< 106.5 109   44 1 (0.4036697 0.5963303) *
##       11) earlier_violent_crime_incidents>=70.5 210   77 1 (0.3666667 0.6333333) *
##    3) total_fines_by_parcel>=25 1549  601 1 (0.3879923 0.6120077)  
##      6) earlier_property_crime_incidents>=179.5 1082  485 1 (0.4482440 0.5517560)  
##       12) total_fines_by_parcel< 512.5 501  239 0 (0.5229541 0.4770459)  
##         24) earlier_violent_crime_incidents< 163.5 225   80 0 (0.6444444 0.3555556) *
##         25) earlier_violent_crime_incidents>=163.5 276  117 1 (0.4239130 0.5760870) *
##       13) total_fines_by_parcel>=512.5 581  223 1 (0.3838210 0.6161790) *
##      7) earlier_property_crime_incidents< 179.5 467  116 1 (0.2483940 0.7516060) *
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#Random Forest, using the same formula as in the logistic and rpart models (using 500 decision trees)
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6437799
sd(as.numeric(accuracies))
## [1] 0.0238321
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 139  76
##   TRUE   47 102
##        truth
## pred      0   1
##   FALSE 120  73
##   TRUE   66 105
##        truth
## pred      0   1
##   FALSE 134  78
##   TRUE   53 100
##        truth
## pred      0   1
##   FALSE 128  79
##   TRUE   58  99
##        truth
## pred      0   1
##   FALSE 132  71
##   TRUE   54 107
##        truth
## pred      0   1
##   FALSE 128  75
##   TRUE   58 103
##        truth
## pred      0   1
##   FALSE 141  66
##   TRUE   45 112
##        truth
## pred      0   1
##   FALSE 124  78
##   TRUE   62 100
##        truth
## pred      0   1
##   FALSE 138  80
##   TRUE   48  98
##        truth
## pred      0   1
##   FALSE 146  90
##   TRUE   40  88
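
randomForest can also report which predictors the forest leans on; a quick look at one fold's model (a sketch):

#mean decrease in Gini impurity for each predictor in the first fold's forest
importance(models[[1]])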

Now add some fixed physical properties of the parcels to the models, to see if they improve the model fit.

#add the parcel area (total_acre), to see if it improves the fit
tallies_with_area <- tallies_with_crime %>% 
  left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, total_acre), 
            by = "parcelnum") %>% select(-geometry)
  
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#the formula now adds the parcel area
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6503763
sd(as.numeric(accuracies))
## [1] 0.0249872
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 125  71
##   TRUE   58 110
##        truth
## pred      0   1
##   FALSE 123  55
##   TRUE   58 128
##        truth
## pred      0   1
##   FALSE 133  70
##   TRUE   65  97
##        truth
## pred      0   1
##   FALSE 133  59
##   TRUE   59 113
##        truth
## pred      0   1
##   FALSE 133  66
##   TRUE   53 112
##        truth
## pred      0   1
##   FALSE 110  71
##   TRUE   67 116
##        truth
## pred      0   1
##   FALSE 123  64
##   TRUE   77 100
##        truth
## pred      0   1
##   FALSE 128  62
##   TRUE   61 113
##        truth
## pred      0   1
##   FALSE 116  84
##   TRUE   46 118
##        truth
## pred      0   1
##   FALSE 124  58
##   TRUE   69 113

Try and test an rpart model.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as with the glm model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6358151
sd(as.numeric(accuracies))
## [1] 0.0215054
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 120  58
##   TRUE   66 120
##        truth
## pred      0   1
##   FALSE 120  62
##   TRUE   66 116
##        truth
## pred      0   1
##   FALSE 122  69
##   TRUE   65 109
##        truth
## pred      0   1
##   FALSE 120  70
##   TRUE   66 108
##        truth
## pred      0   1
##   FALSE 122  64
##   TRUE   64 114
##        truth
## pred      0   1
##   FALSE 104  60
##   TRUE   82 118
##        truth
## pred      0   1
##   FALSE 116  58
##   TRUE   70 120
##        truth
## pred      0   1
##   FALSE 124  72
##   TRUE   62 106
##        truth
## pred      0   1
##   FALSE 136  74
##   TRUE   50 104
##        truth
## pred      0   1
##   FALSE 120  82
##   TRUE   66  96
models[[9]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 35 1730  664 0 (0.6161850 0.3838150)  
##      4) total_acre>=0.1065 854  244 0 (0.7142857 0.2857143) *
##      5) total_acre< 0.1065 876  420 0 (0.5205479 0.4794521)  
##       10) earlier_property_crime_incidents>=154.5 632  256 0 (0.5949367 0.4050633) *
##       11) earlier_property_crime_incidents< 154.5 244   80 1 (0.3278689 0.6721311) *
##    3) total_fines_by_parcel>=35 1547  609 1 (0.3936652 0.6063348)  
##      6) earlier_property_crime_incidents>=179.5 1076  487 1 (0.4526022 0.5473978)  
##       12) total_fines_by_parcel< 512.5 511  235 0 (0.5401174 0.4598826)  
##         24) earlier_violent_crime_incidents< 163.5 234   83 0 (0.6452991 0.3547009) *
##         25) earlier_violent_crime_incidents>=163.5 277  125 1 (0.4512635 0.5487365)  
##           50) earlier_property_crime_incidents>=356.5 51   17 0 (0.6666667 0.3333333) *
##           51) earlier_property_crime_incidents< 356.5 226   91 1 (0.4026549 0.5973451) *
##       13) total_fines_by_parcel>=512.5 565  211 1 (0.3734513 0.6265487) *
##      7) earlier_property_crime_incidents< 179.5 471  122 1 (0.2590234 0.7409766) *

Try and test a random forest model.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as with the glm and rpart models above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6657451
sd(as.numeric(accuracies))
## [1] 0.03212965
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 138  55
##   TRUE   48 123
##        truth
## pred      0   1
##   FALSE 118  70
##   TRUE   68 108
##        truth
## pred      0   1
##   FALSE 140  67
##   TRUE   47 111
##        truth
## pred      0   1
##   FALSE 128  81
##   TRUE   58  97
##        truth
## pred      0   1
##   FALSE 136  63
##   TRUE   50 115
##        truth
## pred      0   1
##   FALSE 137  70
##   TRUE   49 108
##        truth
## pred      0   1
##   FALSE 137  63
##   TRUE   49 115
##        truth
## pred      0   1
##   FALSE 130  69
##   TRUE   56 109
##        truth
## pred      0   1
##   FALSE 139  78
##   TRUE   47 100
##        truth
## pred      0   1
##   FALSE 137  80
##   TRUE   49  98
#add the city-council district, to see if it improves the fit
tallies_with_district <- tallies_with_area %>% 
  left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, council_di), 
            by = "parcelnum")
  
tallies_with_district$council_di <- as.character(tallies_with_district$council_di)

district_recording_errors <- tallies_with_district %>% 
  filter(! council_di %in% c("01", "02", "03", "04", "05", "06", "07"))

district_recording_errors <- parcels_with_labels %>% select(parcelnum) %>%
  inner_join(district_recording_errors, by = "parcelnum")

district_recording_errors <- district_recording_errors %>% 
  st_join(city_council_districts %>% select(districts), join = st_within)
## although coordinates are longitude/latitude, st_within assumes that they are planar
district_recording_errors <- district_recording_errors %>%
  mutate(council_di = districts) %>% as.data.frame() %>% 
  select(-districts, -geometry) %>% as_tibble()

tallies_with_district <- tallies_with_district %>%
  filter(council_di %in% c("01", "02", "03", "04", "05", "06", "07")) %>%
  rbind(district_recording_errors)

#with the observation that, for example, district 7 is currently represented by both of the strings "07" and "7", we make the representation consistent
tallies_with_district <- tallies_with_district %>%
  mutate(council_di = as.integer(council_di)) %>%
  mutate(council_di = as.factor(council_di))
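
After this normalization every parcel should fall into one of districts 1 through 7; a one-line check (a sketch):

#confirm that the district labels are now consistent
table(tallies_with_district$council_di, useNA = "ifany")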

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_district %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6380047
sd(as.numeric(accuracies))
## [1] 0.02938452
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 120  53
##   TRUE   66 125
##        truth
## pred      0   1
##   FALSE 123  61
##   TRUE   63 117
##        truth
## pred      0   1
##   FALSE 122  58
##   TRUE   65 120
##        truth
## pred      0   1
##   FALSE 116  68
##   TRUE   70 110
##        truth
## pred      0   1
##   FALSE 116  59
##   TRUE   70 119
##        truth
## pred      0   1
##   FALSE 119  87
##   TRUE   67  91
##        truth
## pred      0   1
##   FALSE 118  68
##   TRUE   68 110
##        truth
## pred      0   1
##   FALSE 110  52
##   TRUE   76 126
##        truth
## pred      0   1
##   FALSE 118  57
##   TRUE   68 121
##        truth
## pred      0   1
##   FALSE 133  89
##   TRUE   53  89
models[[1]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 35 1743  666 0 (0.6179002 0.3820998)  
##      4) total_acre>=0.1065 866  247 0 (0.7147806 0.2852194) *
##      5) total_acre< 0.1065 877  419 0 (0.5222349 0.4777651)  
##       10) earlier_property_crime_incidents>=189.5 532  206 0 (0.6127820 0.3872180) *
##       11) earlier_property_crime_incidents< 189.5 345  132 1 (0.3826087 0.6173913) *
##    3) total_fines_by_parcel>=35 1534  598 1 (0.3898305 0.6101695)  
##      6) earlier_property_crime_incidents>=179.5 1068  478 1 (0.4475655 0.5524345)  
##       12) total_fines_by_parcel< 512.5 503  235 0 (0.5328032 0.4671968)  
##         24) earlier_violent_crime_incidents< 176.5 290  115 0 (0.6034483 0.3965517) *
##         25) earlier_violent_crime_incidents>=176.5 213   93 1 (0.4366197 0.5633803) *
##       13) total_fines_by_parcel>=512.5 565  210 1 (0.3716814 0.6283186) *
##      7) earlier_property_crime_incidents< 179.5 466  120 1 (0.2575107 0.7424893) *

Now try random forest, with the dataset and formula above.

#same formula as in rpart model above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6657474
sd(as.numeric(accuracies))
## [1] 0.01953805
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 131  63
##   TRUE   55 115
##        truth
## pred      0   1
##   FALSE 134  63
##   TRUE   52 115
##        truth
## pred      0   1
##   FALSE 141  71
##   TRUE   46 107
##        truth
## pred      0   1
##   FALSE 126  70
##   TRUE   60 108
##        truth
## pred      0   1
##   FALSE 134  69
##   TRUE   52 109
##        truth
## pred      0   1
##   FALSE 134  78
##   TRUE   52 100
##        truth
## pred      0   1
##   FALSE 127  73
##   TRUE   59 105
##        truth
## pred      0   1
##   FALSE 128  59
##   TRUE   58 119
##        truth
## pred      0   1
##   FALSE 130  56
##   TRUE   56 122
##        truth
## pred      0   1
##   FALSE 140  79
##   TRUE   46  99

Now add the frontage (from the parcel data). I wouldn’t expect this to add very much to the value of the model (given that we have already included area), but let’s see.

tallies_with_frontage <- tallies_with_district %>% 
  left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, frontage), by = "parcelnum")

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_frontage %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6448736
sd(as.numeric(accuracies))
## [1] 0.02837062
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 134  58
##   TRUE   52 120
##        truth
## pred      0   1
##   FALSE 112  59
##   TRUE   74 119
##        truth
## pred      0   1
##   FALSE 122  59
##   TRUE   65 119
##        truth
## pred      0   1
##   FALSE 120  65
##   TRUE   66 113
##        truth
## pred      0   1
##   FALSE 114  56
##   TRUE   72 122
##        truth
## pred      0   1
##   FALSE 123  84
##   TRUE   63  94
##        truth
## pred      0   1
##   FALSE 126  75
##   TRUE   60 103
##        truth
## pred      0   1
##   FALSE 114  54
##   TRUE   72 124
##        truth
## pred      0   1
##   FALSE 127  61
##   TRUE   59 117
##        truth
## pred      0   1
##   FALSE 136  89
##   TRUE   50  89
models[[2]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 287.5 2216  912 0 (0.5884477 0.4115523)  
##      4) frontage>=35.5 1310  425 0 (0.6755725 0.3244275) *
##      5) frontage< 35.5 906  419 1 (0.4624724 0.5375276)  
##       10) earlier_property_crime_incidents>=154.5 641  295 0 (0.5397816 0.4602184)  
##         20) council_di=3,6,7 291  106 0 (0.6357388 0.3642612) *
##         21) council_di=1,2,4,5 350  161 1 (0.4600000 0.5400000) *
##       11) earlier_property_crime_incidents< 154.5 265   73 1 (0.2754717 0.7245283) *
##    3) total_fines_by_parcel>=287.5 1061  371 1 (0.3496701 0.6503299) *

Random forest again:

#same formula as in rpart model above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6759055
sd(as.numeric(accuracies))
## [1] 0.0196406
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 131  66
##   TRUE   55 112
##        truth
## pred      0   1
##   FALSE 130  62
##   TRUE   56 116
##        truth
## pred      0   1
##   FALSE 143  64
##   TRUE   44 114
##        truth
## pred      0   1
##   FALSE 129  66
##   TRUE   57 112
##        truth
## pred      0   1
##   FALSE 132  68
##   TRUE   54 110
##        truth
## pred      0   1
##   FALSE 134  71
##   TRUE   52 107
##        truth
## pred      0   1
##   FALSE 132  73
##   TRUE   54 105
##        truth
## pred      0   1
##   FALSE 133  52
##   TRUE   53 126
##        truth
## pred      0   1
##   FALSE 129  56
##   TRUE   57 122
##        truth
## pred      0   1
##   FALSE 142  76
##   TRUE   44 102

We now look at the effect of nearby vacant parcels, i.e. parcels in the region with no buildings on them. Many of these will be lots on which buildings have already been dismantled. (However, some of them could be public parks or the like.)
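
Before counting neighbors, it is worth a quick look at how common such parcels are (a sketch, using the num_buildi column from the parcel shapefile):

#how many parcels in the shapefile have no buildings on them?
parcel_sf %>% as.data.frame() %>% count(num_buildi == 0)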

distance <- set_units(200, "m")  #a 200-meter buffer around each parcel

vacant_parcel_buffers <- parcel_sf %>% select(parcelnum) %>% 
  inner_join(tallies_with_frontage, by = "parcelnum") %>%
  st_transform(2253) %>% st_buffer(distance)
  
vacant_parcel_join <- vacant_parcel_buffers %>% 
  st_join(parcel_sf %>% st_transform(2253) %>% filter(num_buildi == 0) %>% select(parcelnum), 
          join = st_contains) %>%
  filter(parcelnum.x != parcelnum.y) %>% 
  group_by(parcelnum.x) %>% summarize(num_vacant_parcels = n())

tallies_with_vacant_parcels <- tallies_with_frontage %>% 
  left_join(vacant_parcel_join %>% as.data.frame() %>% select(-geometry),
            by = c("parcelnum" = "parcelnum.x"))

tallies_with_vacant_parcels <- tallies_with_vacant_parcels %>%
  mutate(num_vacant_parcels = ifelse(is.na(num_vacant_parcels), 0, num_vacant_parcels))
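
The ifelse(is.na(...)) pattern above recurs throughout this analysis; tidyr::replace_na expresses the same recoding more directly (an equivalent sketch, not what was run here):

#equivalent to the mutate/ifelse above
tallies_with_vacant_parcels %>%
  mutate(num_vacant_parcels = tidyr::replace_na(num_vacant_parcels, 0))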

Model with rpart:

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_vacant_parcels %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)


formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6808573
sd(as.numeric(accuracies))
## [1] 0.02008773
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 113  51
##   TRUE   73 127
##        truth
## pred      0   1
##   FALSE 138  69
##   TRUE   48 109
##        truth
## pred      0   1
##   FALSE 135  65
##   TRUE   52 113
##        truth
## pred      0   1
##   FALSE 123  54
##   TRUE   63 124
##        truth
## pred      0   1
##   FALSE 123  39
##   TRUE   63 139
##        truth
## pred      0   1
##   FALSE 122  60
##   TRUE   64 118
##        truth
## pred      0   1
##   FALSE 125  64
##   TRUE   61 114
##        truth
## pred      0   1
##   FALSE 132  61
##   TRUE   54 117
##        truth
## pred      0   1
##   FALSE 110  37
##   TRUE   76 141
##        truth
## pred      0   1
##   FALSE 141  63
##   TRUE   45 115
models[[7]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3277 1602 0 (0.5111382 0.4888618)  
##   2) num_vacant_parcels< 41.5 1672  525 0 (0.6860048 0.3139952) *
##   3) num_vacant_parcels>=41.5 1605  528 1 (0.3289720 0.6710280) *

Now try random forest:

models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7033795
sd(as.numeric(accuracies))
## [1] 0.02465524
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 123  50
##   TRUE   63 128
##        truth
## pred      0   1
##   FALSE 128  59
##   TRUE   58 119
##        truth
## pred      0   1
##   FALSE 140  63
##   TRUE   47 115
##        truth
## pred      0   1
##   FALSE 129  56
##   TRUE   57 122
##        truth
## pred      0   1
##   FALSE 135  43
##   TRUE   51 135
##        truth
## pred      0   1
##   FALSE 137  54
##   TRUE   49 124
##        truth
## pred      0   1
##   FALSE 125  60
##   TRUE   61 118
##        truth
## pred      0   1
##   FALSE 126  46
##   TRUE   60 132
##        truth
## pred      0   1
##   FALSE 125  33
##   TRUE   61 145
##        truth
## pred      0   1
##   FALSE 131  54
##   TRUE   55 124

With the idea that blighted buildings may come in clusters, we may guess that blight in one building makes nearby buildings more likely to become blighted. So, for each parcel we are using for training and testing, we count the nearby buildings that became blighted before June of 2016, i.e. before the period over which we are attempting to carry out our predictions.

library(lubridate)

#identify the earlier completed demolitions and put the results into a form that can be combined with the dismantle permits results
earlier_completed_demolitions_sf <- completed_demolitions_sf %>% 
  mutate(Date = `Demolition Date`, parcelnum = `Parcel ID`) %>%
  select(Date, parcelnum) %>% filter(Date < mdy("06/01/2016"))

#repeat for the dismantle permit results
earlier_dismantle_permits_sf <- dismantle_permits_sf %>%
  mutate(Date = `Permit Issued`, parcelnum = `Parcel Number`) %>%
  select(Date, parcelnum) %>% filter(Date < mdy("06/01/2016"))

#combine the above results, and ensure that parcels being counted are not the parcels that we are using for our modeling dataset
earlier_blighted_parcels <- rbind(earlier_completed_demolitions_sf, earlier_dismantle_permits_sf) %>%
  as.data.frame() %>% select(-geometry) %>% distinct(parcelnum) %>% 
  filter(! parcelnum %in% tallies_with_vacant_parcels$parcelnum)

#add the parcel geometry to the set of earlier blighted parcels
earlier_blighted_parcels <- parcel_sf %>% select(parcelnum) %>% 
  inner_join(earlier_blighted_parcels, by = "parcelnum")

#set a buffer distance of 100 meters around each relevant parcel
distance <- set_units(100, "m")

#expand the geometry for each parcel by 100 meters
buffered_parcels <- parcel_sf %>% select(parcelnum) %>%
  inner_join(tallies_with_vacant_parcels, by = "parcelnum") %>%
  st_transform(2253) %>% st_buffer(distance)

#count the numbers of blighted parcels within the expanded geometry (excluding the parcel for which we are counting)
tallies_of_earlier_blighted_parcels <- buffered_parcels %>% select(parcelnum) %>%
  st_join(earlier_blighted_parcels %>% st_transform(2253), 
          join = st_contains, left = FALSE) %>% filter(parcelnum.x != parcelnum.y)

tallies_of_earlier_blighted_parcels <- tallies_of_earlier_blighted_parcels %>%
  as.data.frame() %>% select(-geometry) %>%
  group_by(parcelnum.x) %>% summarize(num_nearby_blighted_parcels = n())

#our modeling dataset, now with the new quantity
tallies_with_earlier_blighted_parcels <- tallies_with_vacant_parcels %>%
  left_join(tallies_of_earlier_blighted_parcels, by = c("parcelnum" = "parcelnum.x")) %>%
  mutate(num_nearby_blighted_parcels = ifelse(is.na(num_nearby_blighted_parcels), 0,
                                              num_nearby_blighted_parcels))

rm(earlier_completed_demolitions_sf, earlier_dismantle_permits_sf, buffered_parcels, earlier_blighted_parcels, 
   tallies_of_earlier_blighted_parcels)

We now try an rpart model, this time including the earlier demolition data.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_earlier_blighted_parcels %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  num_nearby_blighted_parcels

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6976065
sd(as.numeric(accuracies))
## [1] 0.02825368
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 118  42
##   TRUE   68 136
##        truth
## pred      0   1
##   FALSE 107  42
##   TRUE   79 136
##        truth
## pred      0   1
##   FALSE 124  42
##   TRUE   63 136
##        truth
## pred      0   1
##   FALSE 134  52
##   TRUE   52 126
##        truth
## pred      0   1
##   FALSE 133  38
##   TRUE   53 140
##        truth
## pred      0   1
##   FALSE 121  44
##   TRUE   65 134
##        truth
## pred      0   1
##   FALSE 113  45
##   TRUE   73 133
##        truth
## pred      0   1
##   FALSE 105  47
##   TRUE   81 131
##        truth
## pred      0   1
##   FALSE 118  38
##   TRUE   68 140
##        truth
## pred      0   1
##   FALSE 114  37
##   TRUE   72 141
models[[7]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) num_nearby_blighted_parcels< 1.5 1561  456 0 (0.7078796 0.2921204)  
##      4) total_fines_by_parcel< 125 1000  186 0 (0.8140000 0.1860000) *
##      5) total_fines_by_parcel>=125 561  270 0 (0.5187166 0.4812834)  
##       10) num_vacant_parcels< 54.5 445  185 0 (0.5842697 0.4157303) *
##       11) num_vacant_parcels>=54.5 116   31 1 (0.2672414 0.7327586) *
##    3) num_nearby_blighted_parcels>=1.5 1716  570 1 (0.3321678 0.6678322) *

Now try random forest:

models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7154614
sd(as.numeric(accuracies))
## [1] 0.01716549
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 124  39
##   TRUE   62 139
##        truth
## pred      0   1
##   FALSE 134  54
##   TRUE   52 124
##        truth
## pred      0   1
##   FALSE 140  55
##   TRUE   47 123
##        truth
## pred      0   1
##   FALSE 128  45
##   TRUE   58 133
##        truth
## pred      0   1
##   FALSE 140  48
##   TRUE   46 130
##        truth
## pred      0   1
##   FALSE 130  48
##   TRUE   56 130
##        truth
## pred      0   1
##   FALSE 122  52
##   TRUE   64 126
##        truth
## pred      0   1
##   FALSE 123  46
##   TRUE   63 132
##        truth
## pred      0   1
##   FALSE 131  41
##   TRUE   55 137
##        truth
## pred      0   1
##   FALSE 130  49
##   TRUE   56 129

Now we add the blight violations recorded at nearby parcels, tallying both their number and their total fines.

parcels <- parcel_sf %>% select(parcelnum) %>% 
  inner_join(tallies_with_earlier_blighted_parcels %>% select(parcelnum), 
             by = "parcelnum")

distance <- set_units(100, "m")

buffered_parcels <- parcels %>% st_transform(2253) %>% st_buffer(distance)

blight_violation_points_in_buffer <- blight_violations_sf %>% 
  st_transform(2253) %>% st_join(buffered_parcels, join = st_within) %>%
  filter(`Violation Parcel ID` != parcelnum)

violations_at_neighboring_parcels <- blight_violation_points_in_buffer %>% 
  group_by(parcelnum) %>% 
  summarize(num_violations_nearby_parcels = n(),
            sum_fines_nearby_parcels = sum(as.numeric(str_sub(`Fine Amount`, start = 2)), na.rm = TRUE))

tallies_with_nearby_violations <- tallies_with_earlier_blighted_parcels %>%
  left_join(violations_at_neighboring_parcels %>% as.data.frame() %>% select(-geometry),
            by = "parcelnum")

tallies_with_nearby_violations <- tallies_with_nearby_violations %>%
  mutate(sum_fines_nearby_parcels = 
           ifelse(is.na(sum_fines_nearby_parcels), 0, sum_fines_nearby_parcels),
         num_violations_nearby_parcels = 
           ifelse(is.na(num_violations_nearby_parcels), 0, num_violations_nearby_parcels))
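
The str_sub call above assumes every Fine Amount starts with exactly one character (the dollar sign) before the number; readr::parse_number is more forgiving of commas and stray characters (an alternative sketch, not what was run here):

#alternative parsing of the fine strings, tolerant of "$" and thousands separators
blight_violation_points_in_buffer %>% as.data.frame() %>%
  group_by(parcelnum) %>%
  summarize(sum_fines_nearby_parcels = sum(readr::parse_number(`Fine Amount`), na.rm = TRUE))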

We use this dataset to construct an rpart model:

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_nearby_violations %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(515)
k_folds <- caret::createFolds(train$blighted)

#note that only the nearby-violation count enters the formula here; sum_fines_nearby_parcels is not included
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  num_nearby_blighted_parcels + num_violations_nearby_parcels

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6973355
sd(as.numeric(accuracies))
## [1] 0.02010073
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 107  30
##   TRUE   80 148
##        truth
## pred      0   1
##   FALSE 117  44
##   TRUE   69 134
##        truth
## pred      0   1
##   FALSE 112  45
##   TRUE   74 133
##        truth
## pred      0   1
##   FALSE 129  60
##   TRUE   57 118
##        truth
## pred      0   1
##   FALSE 115  39
##   TRUE   71 139
##        truth
## pred      0   1
##   FALSE 121  46
##   TRUE   65 132
##        truth
## pred      0   1
##   FALSE 130  39
##   TRUE   56 139
##        truth
## pred      0   1
##   FALSE 116  31
##   TRUE   70 147
##        truth
## pred      0   1
##   FALSE 133  63
##   TRUE   53 115
##        truth
## pred      0   1
##   FALSE 120  44
##   TRUE   66 134
models[[9]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) num_nearby_blighted_parcels< 1.5 1552  454 0 (0.7074742 0.2925258)  
##      4) total_fines_by_parcel< 125 1005  190 0 (0.8109453 0.1890547) *
##      5) total_fines_by_parcel>=125 547  264 0 (0.5173675 0.4826325)  
##       10) num_vacant_parcels< 52.5 422  172 0 (0.5924171 0.4075829) *
##       11) num_vacant_parcels>=52.5 125   33 1 (0.2640000 0.7360000) *
##    3) num_nearby_blighted_parcels>=1.5 1725  577 1 (0.3344928 0.6655072)  
##      6) num_nearby_blighted_parcels< 4.5 868  380 1 (0.4377880 0.5622120)  
##       12) num_vacant_parcels< 65.5 550  275 0 (0.5000000 0.5000000)  
##         24) total_fines_by_parcel< 25 262  105 0 (0.5992366 0.4007634) *
##         25) total_fines_by_parcel>=25 288  118 1 (0.4097222 0.5902778) *
##       13) num_vacant_parcels>=65.5 318  105 1 (0.3301887 0.6698113) *
##      7) num_nearby_blighted_parcels>=4.5 857  197 1 (0.2298716 0.7701284) *

For comparison, a logistic-regression model with the same formula:

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))


#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7044724
sd(as.numeric(accuracies))
## [1] 0.02181718
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 142  57
##   TRUE   45 121
##        truth
## pred      0   1
##   FALSE 136  59
##   TRUE   50 119
##        truth
## pred      0   1
##   FALSE 131  64
##   TRUE   55 114
##        truth
## pred      0   1
##   FALSE 129  55
##   TRUE   57 123
##        truth
## pred      0   1
##   FALSE 137  55
##   TRUE   49 123
##        truth
## pred      0   1
##   FALSE 139  59
##   TRUE   47 119
##        truth
## pred      0   1
##   FALSE 152  57
##   TRUE   34 121
##        truth
## pred      0   1
##   FALSE 136  57
##   TRUE   50 121
##        truth
## pred      0   1
##   FALSE 136  67
##   TRUE   50 111
##        truth
## pred      0   1
##   FALSE 142  65
##   TRUE   44 113
#inspect the logistic model fit on the ninth fold
models[[9]]
## 
## Call:  glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]], 
##     ])
## 
## Coefficients:
##                      (Intercept)             total_fines_by_parcel  
##                       -8.676e-01                         8.604e-05  
##   number_of_violations_by_parcel            improve_issues_tallies  
##                        1.088e-01                         2.103e-01  
## earlier_property_crime_incidents   earlier_violent_crime_incidents  
##                       -5.042e-03                         7.156e-03  
##                       total_acre                       council_di2  
##                        8.242e-02                        -2.499e-01  
##                      council_di3                       council_di4  
##                       -1.290e-01                         4.501e-03  
##                      council_di5                       council_di6  
##                       -5.813e-01                        -3.402e-01  
##                      council_di7                          frontage  
##                       -1.785e-01                        -5.867e-03  
##               num_vacant_parcels       num_nearby_blighted_parcels  
##                        1.444e-02                         1.820e-01  
##    num_violations_nearby_parcels  
##                       -2.572e-03  
## 
## Degrees of Freedom: 3276 Total (i.e. Null);  3260 Residual
## Null Deviance:       4541 
## Residual Deviance: 3671  AIC: 3705
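Accuracy alone hides the asymmetry between the two error types visible in the confusion matrices. As a minimal sketch, keeping the 0.5 threshold, per-fold precision and recall for the logistic models can be computed from the held-out predictions (precision_recall is a new name introduced just for this check):

#sketch: precision and recall on each held-out fold at the 0.5 threshold
precision_recall <- 1:10 %>% map(function(i) {
  pred  <- predictions[[i]] > 0.5
  truth <- train[k_folds[[i]],]$blighted == "1"
  c(precision = sum(pred & truth) / sum(pred),
    recall    = sum(pred & truth) / sum(truth))
})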

Now we try a random forest:

#fit a random forest on each of the ten training folds
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

#class predictions on each held-out fold, converted from factor to numeric 0/1
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~ as.numeric(predictions[[.x]]) - 1)

#held-out accuracy for each fold
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7269938
sd(as.numeric(accuracies))
## [1] 0.02106707
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 135  42
##   TRUE   52 136
##        truth
## pred      0   1
##   FALSE 131  41
##   TRUE   55 137
##        truth
## pred      0   1
##   FALSE 127  50
##   TRUE   59 128
##        truth
## pred      0   1
##   FALSE 131  43
##   TRUE   55 135
##        truth
## pred      0   1
##   FALSE 127  37
##   TRUE   59 141
##        truth
## pred      0   1
##   FALSE 130  52
##   TRUE   56 126
##        truth
## pred      0   1
##   FALSE 134  49
##   TRUE   52 129
##        truth
## pred      0   1
##   FALSE 138  38
##   TRUE   48 140
##        truth
## pred      0   1
##   FALSE 133  57
##   TRUE   53 121
##        truth
## pred      0   1
##   FALSE 139  49
##   TRUE   47 129
#inspect the forest fit on the third fold
models[[3]]
## 
## Call:
##  randomForest(formula = formula, data = train[-k_folds[[.x]],      ]) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 28.14%
## Confusion matrix:
##      0    1 class.error
## 0 1178  497   0.2967164
## 1  425 1177   0.2652934
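Beyond the out-of-bag error, a random forest records how much each predictor contributes to the classification. As a final sketch, the built-in importance measures for the third fold's model:

#mean decrease in Gini impurity for each predictor (added inspection)
importance(models[[3]])

#the same information as a dot plot
varImpPlot(models[[3]])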